

/* 

The Effects of Air Pollution on Students’ Cognitive Performance: Evidence from Brazilian University Entrance Tests


Juliana Carneiro, Matthew Cole, Eric Strobl

March 2021
	
This program contains all Stata routines used to generate the Figure and Tables in the manuscript as published.	

Contact: Juliana Carneiro (julianaccp@gmail.com)

*/		


clear all 



*** TABLES ***


*--------------------------------------------------------------------
* Table 1: Summary statistics air pollution, weather and exams scores
*---------------------------------------------------------------------


use dataset1,clear 

 sum  nu_nota_cn nu_nota_ch nu_nota_lc nu_nota_mt pm o3  temp humid windsp municid 
 sum  nu_nota_cn nu_nota_ch nu_nota_lc nu_nota_mt pm o3  temp humid windsp municid if female == 1
 sum  nu_nota_cn nu_nota_ch nu_nota_lc nu_nota_mt pm o3  temp humid windsp municid if female == 0


*This Table was created manually using the values from the sum commands above.

*----------------------------------------------------------------------------
* Table 2: Summary statistics demographic information female and male samples
*----------------------------------------------------------------------------
use dataset2,clear 


*We used tab command to calculate the proportions and used excel to create the table by hand
*Race 
tab white
tab black
tab parda
tab asian 
tab indigenous
tab racenondecl

*School Type
tab federal
tab state
tab municipal
tab private
tab schtynondecl

* Mother's Education

tab mneverstud 
tab mincompl_element 
tab mincompl_ginasio 
tab mcompl_ginasio 
tab mhighshool
tab mundergrad 
tab mposgrad 
tab meducnondecl

* Father's Education

tab fneverstud 
tab fincompl_element 
tab fincompl_ginasio 
tab fcompl_ginasio 
tab fhighshool
tab fundergrad 
tab fposgrad 
tab feducnondecl


*Household Income

tab zerosal
tab umsal 
tab um_ummeiosal 
tab ummeioa2sal 
tab doisa2meiosal 
tab doismeioa3sal 
tab tresa4sal 
tab quatroa5sal 
tab cincoa6sal 
tab seisa7sal 
tab setea8sal 
tab oitoa9sal 
tab novea10sal 
tab deza12sal 
tab dozea15sal 
tab quinzea20sal 
tab mais20sal


*-----------------------------------------------------------------------------------------------------
* Table 3: The Effects of PM10 on Students’ Scores (Main Specification with flexible weather controls)
*-----------------------------------------------------------------------------------------------------


use dataset1,clear

global weather temp16a19 temp19a22  temp25a28 temp28a31 temp31mais humid  humidsq windsp temphumid tempsqhumidsq

global controls female black  parda asian indigenous racenondecl  zerosal umsal um_ummeiosal ummeioa2sal doisa2meiosal doismeioa3sal tresa4sal quatroa5sal cincoa6sal seisa7sal setea8sal oitoa9sal novea10sal deza12sal  quinzea20sal mais20sal


* 1) Municipality FE - pollution linear
eststo l1: qui:  reghdfe notasd pm_10  $weather $controls i.dia , absorb(idmunic_centroid) cluster(idmunic_centroid)

* 2) Municipality FE - pollution dummy
eststo l2: qui:  reghdfe notasd pm21  $weather $controls i.dia , absorb(idmunic_centroid) cluster(idmunic_centroid)


* 3) Students FE - pollution linear
eststo l3: qui: reghdfe notasd pm_10  $weather i.dia , absorb(id_student) cluster(idmunic_centroid)

* 4) Students FE - pollution dummy
eststo l4: qui: reghdfe notasd pm21  $weather i.dia , absorb(id_student) cluster(idmunic_centroid)
 
 estout l1 l2 l3 l4, cells(b(star fmt(2)) se(par fmt(2))) stats(r2 N) starlevels(* 0.10 ** 0.05 *** 0.01) keep(pm_10 pm21) 
 
 
 

*----------------------------------------------------------------------------------------------------
* Table 4: Particulate Matters’s Impact on Scores: Models with Instrumental Variable Wind Direction
*----------------------------------------------------------------------------------------------------

use dataset3, clear

global instwind clus1_wind90 clus2_wind90 clus3_wind90   clus1_wind180 clus2_wind180 clus3_wind180   clus1_wind270 clus2_wind270 clus3_wind270 
global weather temp16a19 temp19a22  temp25a28 temp28a31 temp31mais humid  humidsq windsp temphumid tempsqhumidsq


program my2sls1
* first stage regression
	 reghdfe pm_10 $instwind $weather i.dia, absorb(id_student) cluster(idmunic_centroid)

* get predicted values
predict pm_hat, xb
* second stage regression
 reghdfe notasd pm_hat  $weather i.dia, absorb(id_student) cluster(idmunic_centroid)

drop pm_hat
end

// bootstrap the standard errors
bootstrap, reps(500): my2sls1




**  F stat  first stage

use dataset3, clear

global instwind clus1_wind90 clus2_wind90 clus3_wind90   clus1_wind180 clus2_wind180 clus3_wind180   clus1_wind270 clus2_wind270 clus3_wind270 
global weather temp16a19 temp19a22  temp25a28 temp28a31 temp31mais humid  humidsq windsp temphumid tempsqhumidsq

bootstrap, reps(500): reghdfe pm_10 $instwind $weather i.dia, absorb(id_student) cluster(idmunic_centroid)
test $instwind


** POLLUTANT DUMMY 



program my2sls2
* first stage regression
	 reghdfe pm21 $instwind $weather i.dia, absorb(id_student) cluster(idmunic_centroid)

* get predicted values
predict pm21_hat, xb
* second stage regression
 reghdfe notasd pm21_hat  $weather i.dia, absorb(id_student) cluster(idmunic_centroid)

drop pm_hat
end

// bootstrap the standard errors
bootstrap, reps(500): my2sls2




**  F stat  first stage

 
bootstrap, reps(500): reghdfe pm21 $instwind $weather i.dia, absorb(id_student) cluster(idmunic_centroid)
test $instwind


* The table 4 organised in excel using the results above


*--------------------------------------------------------------------------------------
* Table 5: Heterogeneity in Particulate Matter’s Impact on Students’ Scores per Gender
*--------------------------------------------------------------------------------------

use dataset1,clear
global weather temp16a19 temp19a22  temp25a28 temp28a31 temp31mais humid  humidsq windsp temphumid tempsqhumidsq



** all, female, male

*all
reghdfe notasd pm_10 $weather  i.dia, absorb(id_student) cluster(idmunic_centroid) 
outreg2  using  reg1.xls, replace dec(2) 


*female
reghdfe notasd pm_10 $weather i.dia if female == 1, absorb(id_student) cluster(idmunic_centroid) 
outreg2 using  reg1.xls, append dec(2) 


*male
reghdfe notasd pm_10 $weather i.dia if female == 0, absorb(id_student) cluster(idmunic_centroid) 
outreg2  using  reg1.xls, append dec(2) 



*all
reghdfe notasd pm21 $weather  i.dia, absorb(id_student) cluster(idmunic_centroid) 
outreg2 using  reg1.xls, append dec(2) 


*female
reghdfe notasd pm21 $weather  i.dia if female == 1, absorb(id_student) cluster(idmunic_centroid) 
outreg2  using  reg1.xls, append dec(2) 


*male
reghdfe notasd pm21 $weather  i.dia if female == 0, absorb(id_student) cluster(idmunic_centroid) 
outreg2 using  reg1.xls, append dec(2) 


* Z test and p-value from the coeff. difference calculated manually 



*-------------------------------------------------------------------------------------------------
* Table 6: Heterogeneity in Particulate Matter’s Impact on Students’ Scores across Sub-populations
*-------------------------------------------------------------------------------------------------

use dataset1, clear
global weather temp16a19 temp19a22  temp25a28 temp28a31 temp31mais humid  humidsq windsp temphumid tempsqhumidsq




** PANEL A: PER SCORES
bysort  idmunic_centroid: egen p5nota = pctile(nota), p(5)
bysort  idmunic_centroid: egen p10nota = pctile(nota), p(10)
bysort  idmunic_centroid: egen p25nota = pctile(nota), p(25)
bysort  idmunic_centroid: egen p50nota = pctile(nota), p(50)
bysort  idmunic_centroid: egen p75nota = pctile(nota), p(75)
bysort  idmunic_centroid: egen p95nota = pctile(nota), p(95)


*Main
reghdfe notasd pm_10 $weather   i.dia, absorb(id_student) cluster(idmunic_centroid) 
outreg2  using   reg5.xls, replace   dec(2) 


 * nota <50

reghdfe notasd pm_10 $weather  i.dia if nota <p50nota, absorb(id_student) cluster(idmunic_centroid) 
outreg2  using   reg5.xls, append   dec(2) 

* nota >50

reghdfe notasd pm_10 $weather   i.dia if nota >p50nota, absorb(id_student) cluster(idmunic_centroid) 
outreg2  using   reg5.xls, append   dec(2) 


* nota <5

reghdfe notasd pm_10 $weather   i.dia if nota <p5nota, absorb(id_student) cluster(idmunic_centroid) 
outreg2 using   reg5.xls, append   dec(2) 


* nota >95

reghdfe notasd pm_10 $weather   i.dia if nota >p95nota, absorb(id_student) cluster(idmunic_centroid) 
outreg2   using   reg5.xls, append   dec(2) 



*Main
reghdfe notasd pm21 $weather  i.dia, absorb(id_student) cluster(idmunic_centroid) 
outreg2  using   reg5.xls, append   dec(2) 


 * nota <50

reghdfe notasd pm21 $weather  i.dia if nota <p50nota, absorb(id_student) cluster(idmunic_centroid) 
outreg2  using   reg5.xls, append   dec(2) 

* nota >50

reghdfe notasd pm21 $weather  i.dia if nota >p50nota, absorb(id_student) cluster(idmunic_centroid) 
outreg2  using   reg5.xls, append   dec(2) 


* nota <5

reghdfe notasd pm21 $weather  i.dia if nota <p5nota, absorb(id_student) cluster(idmunic_centroid) 
outreg2 using   reg5.xls, append   dec(2) 


* nota >95

reghdfe notasd pm21 $weather  i.dia if nota >p95nota, absorb(id_student) cluster(idmunic_centroid) 
outreg2   using   reg5.xls, append   dec(2) 




** PANEL B: PER INCOME


 bysort idmunic_centroid: egen p5income = pctile(household_income), p(5)
 
 bysort idmunic_centroid: egen p10income = pctile(household_income), p(10)

 bysort idmunic_centroid: egen p25income = pctile(household_income), p(25)

 bysort idmunic_centroid: egen p50income = pctile(household_income), p(50)

 bysort idmunic_centroid: egen p75income = pctile(household_income), p(75)

 bysort idmunic_centroid: egen p95income = pctile(household_income), p(95)
 

 
 
*Main
reghdfe notasd pm_10 $weather  i.dia, absorb(id_student) cluster(idmunic_centroid) 
outreg2  using   reg5.xls, append   dec(2) 


 * nota <50

reghdfe notasd pm_10 $weather  i.dia if household_income <p50income, absorb(id_student) cluster(idmunic_centroid) 
outreg2  using   reg5.xls, append   dec(2) 

* nota >50

reghdfe notasd pm_10 $weather  i.dia if household_income >p50income, absorb(id_student) cluster(idmunic_centroid) 
outreg2  using   reg5.xls, append   dec(2) 


* nota <5

reghdfe notasd pm_10 $weather  i.dia if household_income <p5income, absorb(id_student) cluster(idmunic_centroid) 
outreg2 using   reg5.xls, append   dec(2) 


* nota >95

reghdfe notasd pm_10 $weather  i.dia if household_income >p95income, absorb(id_student) cluster(idmunic_centroid) 
outreg2   using   reg5.xls, append   dec(2) 



*Main
reghdfe notasd pm21 $weather i.dia, absorb(id_student) cluster(idmunic_centroid) 
outreg2  using   reg5.xls, append   dec(2) 


 * nota <50

reghdfe notasd pm21 $weather  i.dia if household_income <p50income, absorb(id_student) cluster(idmunic_centroid) 
outreg2  using   reg5.xls, append   dec(2) 

* nota >50

reghdfe notasd pm21 $weather  i.dia if household_income >p50income, absorb(id_student) cluster(idmunic_centroid) 
outreg2  using   reg5.xls, append   dec(2) 


* nota <5

reghdfe notasd pm21 $weather i.dia if household_income <p5income, absorb(id_student) cluster(idmunic_centroid) 
outreg2 using   reg5.xls, append   dec(2) 


* nota >95

reghdfe notasd pm21 $weather  i.dia if household_income >p95income, absorb(id_student) cluster(idmunic_centroid) 
outreg2   using   reg5.xls, append   dec(2) 


* Z test and p-value from the coeff. difference calculated manually 






*------------------------------------------------------------------------
* Table 7: The Effects of PM10 on Students’ Scores (with co-pollutant O3)
*------------------------------------------------------------------------

use dataset1, clear
global weather temp16a19 temp19a22  temp25a28 temp28a31 temp31mais humid  humidsq windsp temphumid tempsqhumidsq



** with co-pollutant ozone FE
*All main - pm10
reghdfe notasd pm_10  $weather  i.dia, absorb(id_student) cluster(idmunic_centroid) 
outreg2  using  reg2.xls, replace dec(2) 


*All main - pm21
reghdfe notasd pm21 $weather  i.dia, absorb(id_student) cluster(idmunic_centroid) 
outreg2  using   reg2.xls, append   dec(2) 


*All with ozone - pm10
reghdfe notasd pm_10 o3_10 $weather i.dia, absorb(id_student) cluster(idmunic_centroid) 
outreg2  using  reg2.xls, append dec(2) 


*All with ozone - pm21
reghdfe notasd pm21 o3_10 $weather  i.dia, absorb(id_student) cluster(idmunic_centroid) 
outreg2  using   reg2.xls, append   dec(2) 




*--------------------------------------------------------------------------------
* Table 8: The Effects of PM10 on Students’ Scores (Baseline Model + 1 day lead)
*--------------------------------------------------------------------------------

use dataset4, clear 
global weather temp16a19 temp19a22  temp25a28 temp28a31 temp31mais humid  humidsq windsp temphumid tempsqhumidsq

*flexible weather
reghdfe notasd pm_10 fpm_10 $weather i.dia, a(id_student) cluster(idmunic_centroid)
 
reghdfe notasd fpm_10 $weather i.dia, a(id_student) cluster(idmunic_centroid)

*Table 8 made on excel by hand. 



*----------------------------------------------------------------------------------------------
*** END ****
*----------------------------------------------------------------------------------------------



*----------------------------------------------------------------------------------------------
*** FIGURES ***
*----------------------------------------------------------------------------------------------


*----------------------------------------------------------------------------------------------
* Figure 1: WSS, log(WSS), η2, and PRE for all K cluster solutions (Optimal number of clusters
* For more details: Makles, A. (2012). Stata tip 110: How to get the optimal k-means cluster solution. The Stata Journal, 12(2):347–351.
*----------------------------------------------------------------------------------------------

use css,clear

local list1 "clat clon"
       foreach v of varlist `list1' {
        egen z_`v' = std(`v')
       }
	   
	   
	   local list2 "z_clat z_clon"
       forvalues k = 1(1)11 {
       cluster kmeans z_clat z_clon, k(`k') start(random(134)) name(css`k')
         }
		 
		 
		 
		 
		 
       * WSS matrix
       matrix WSS = J(11,5,.)
       matrix colnames WSS = k WSS log(WSS) eta-squared PRE
       * WSS for each clustering
       forvalues k = 1(1)11 {
         scalar ws`k' = 0
       foreach v of varlist z_clat z_clon {
          quietly anova `v' css`k'
          scalar ws`k' = ws`k' + e(rss)
          }
         matrix WSS[`k', 1] = `k'
         matrix WSS[`k', 2] = ws`k'
         matrix WSS[`k', 3] = log(ws`k')
        matrix WSS[`k', 4] = 1 - ws`k'/WSS[1,2]
        matrix WSS[`k', 5] = (WSS[`k'-1,2] - ws`k')/WSS[`k'-1,2]
       }

	   
	   
	   matrix list WSS
	   
	   
	   
	   
local squared = char(178)

 
 _matplot WSS, columns(2 1) connect(l) xlabel(#10) name(plot1, replace) nodraw  noname
 _matplot WSS, columns(3 1) connect(l) xlabel(#10) name(plot2, replace) nodraw  noname
 _matplot WSS, columns(4 1) connect(l) xlabel(#10) name(plot3, replace) nodraw  noname ytitle({&eta}2)
 
  _matplot WSS, columns(5 1) connect(l) xlabel(#10) name(plot4, replace) nodraw  noname
      graph combine plot1 plot2 plot3 plot4, name(plot1to4, replace)

	  
	  
	  
*---------------------------------------------------------
* Figure 2: The effect of wind direction on air pollution
*---------------------------------------------------------

use dataset5, clear
gen wind = 0
replace wind = 1 if winddir90 == 1
replace wind = 2 if winddir180 == 1
replace wind = 3 if winddir270 == 1
replace wind = 4 if winddir360 == 1

preserve
collapse (mean) pmmean = pm (sd) se_pm = pm, by(wind)
gen yu = pmmean + 1.96*se_pm
gen yl = pmmean + 1.96*se_pm
label define wind 1 "N-E" 2 "E-S" 3 "S-W" 4 "W-N"
label values wind wind

set scheme s1color
serrbar pmmean se_pm wind, scale (1.96) xtitle("Wind Direction",height(6)) xlab(1(1)4) xlabel(, valuelabel) ytitle("PM10 average",height(6))



*-----------------------------------------------------------------------------------------------
* Figure 3: Cognitive impact of PM10 per distance from the station to the municipality centroid
*-----------------------------------------------------------------------------------------------

use dataset6, clear
global weather temp16a19 temp19a22   temp25a28 temp28a31 temp31mais humid  humidsq windsp temphumid tempsqhumidsq 

* generating AQI variables with names to indicate model for graph
			capture drop yy*
			capture gen yy1=pm_10
			capture gen yy2=pm_10
			capture gen yy3=pm_10
			capture gen yy4=pm_10
			capture gen yy5=pm_10
		

reghdfe notasd yy1 $weather if km==10, a(id_student dia) cluster(idmunic_centroid)
estimates store km10


reghdfe notasd yy2 $weather if km==20, a(id_student dia) cluster(idmunic_centroid)
estimates store km20

reghdfe notasd yy3 $weather if km==30, a(id_student dia) cluster(idmunic_centroid)
estimates store km30

reghdfe notasd yy4 $weather if km==40, a(id_student dia) cluster(idmunic_centroid)
estimates store km40

reghdfe notasd yy5 $weather if km==50, a(id_student dia) cluster(idmunic_centroid)
estimates store km50

coefplot (km10, label(off) offset(0)) (km20, label(off) offset(0)) (km30, label(off) offset(0)) (km40, label(off) offset(0)) (km50, label(off) offset(0)), keep(yy*) nolabels xtitle() coeflabels(yy1="10km" yy2="20km" yy3="30km" yy4="40km" yy5="50km", notick labsize(small) wrap(12) labcolor(green) labgap(1)) omitted scheme(s1color) vertical ytitle("{bf:Coefficient estimate per 10mg/m3 PM10}") xtitle("{bf:Distances}")  label(off) legend(off) yline(0) 




